{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Recommendation Systems using TensorFlow\n",
    "\n",
    "### Builds a recommendation engine using Movie Lens data \n",
    "We use the 1M data set for building our recommendation engine.\n",
    "The 1M data contain 1,000,209 anonymous ratings \n",
    "- Ratings for approximately *3,900 movies* \n",
    "- Ratings provided by *6,040 MovieLens users* who joined MovieLens in 2000\n",
    "\n",
    "More information about the data can be viewed at: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Imports for data io operations\n",
    "from collections import deque\n",
    "from six import next\n",
    "import readers\n",
    "\n",
    "# Main imports for training\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "\n",
    "# Evaluate train times per epoch\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Constant seed for replicating training results\n",
    "np.random.seed(42)\n",
    "\n",
    "u_num = 6040 # Number of users in the dataset\n",
    "i_num = 3952 # Number of movies in the dataset\n",
    "\n",
    "batch_size = 1000 # Number of samples per batch\n",
    "dims = 5          # Dimensions of the data, 15\n",
    "max_epochs = 50   # Number of times the network sees all the training data\n",
    "\n",
    "# Device used for all computations\n",
    "place_device = \"/cpu:0\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_data():\n",
    "    # Reads file using the demiliter :: form the ratings file\n",
    "    # Download movie lens data from: http://files.grouplens.org/datasets/movielens/ml-1m.zip\n",
    "    # Columns are user ID, item ID, rating, and timestamp\n",
    "    # Sample data - 3::1196::4::978297539\n",
    "    df = readers.read_file(\"ratings.dat\", sep=\"::\")\n",
    "    rows = len(df)\n",
    "    # Purely integer-location based indexing for selection by position\n",
    "    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)\n",
    "    # Separate data into train and test, 90% for train and 10% for test\n",
    "    split_index = int(rows * 0.9)\n",
    "    # Use indices to separate the data\n",
    "    df_train = df[0:split_index]\n",
    "    df_test = df[split_index:].reset_index(drop=True)\n",
    "    \n",
    "    return df_train, df_test\n",
    "\n",
    "def clip(x):\n",
    "    return np.clip(x, 1.0, 5.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def model(user_batch, item_batch, user_num, item_num, dim=5, device=\"/cpu:0\"):\n",
    "    with tf.device(\"/cpu:0\"):\n",
    "        # Using a global bias term\n",
    "        bias_global = tf.get_variable(\"bias_global\", shape=[])\n",
    "        # User and item bias variables\n",
    "        # get_variable: Prefixes the name with the current variable scope \n",
    "        # and performs reuse checks.\n",
    "        w_bias_user = tf.get_variable(\"embd_bias_user\", shape=[user_num])\n",
    "        w_bias_item = tf.get_variable(\"embd_bias_item\", shape=[item_num])\n",
    "        # embedding_lookup: Looks up 'ids' in a list of embedding tensors\n",
    "        # Bias embeddings for user and items, given a batch\n",
    "        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name=\"bias_user\")\n",
    "        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name=\"bias_item\")\n",
    "        # User and item weight variables\n",
    "        w_user = tf.get_variable(\"embd_user\", shape=[user_num, dim],\n",
    "                                 initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
    "        w_item = tf.get_variable(\"embd_item\", shape=[item_num, dim],\n",
    "                                 initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
    "        # Weight embeddings for user and items, given a batch\n",
    "        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name=\"embedding_user\")\n",
    "        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name=\"embedding_item\")\n",
    "    \n",
    "    with tf.device(device):\n",
    "        # reduce_sum: Computes the sum of elements across dimensions of a tensor\n",
    "        infer = tf.reduce_sum(tf.mul(embd_user, embd_item), 1)\n",
    "        infer = tf.add(infer, bias_global)\n",
    "        infer = tf.add(infer, bias_user)\n",
    "        infer = tf.add(infer, bias_item, name=\"svd_inference\")\n",
    "        # l2_loss: Computes half the L2 norm of a tensor without the sqrt\n",
    "        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), \n",
    "                             name=\"svd_regularizer\")\n",
    "    return infer, regularizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def loss(infer, regularizer, rate_batch, learning_rate=0.1, reg=0.1, device=\"/cpu:0\"):\n",
    "    with tf.device(device):\n",
    "        # Use L2 loss to compute penalty\n",
    "        cost_l2 = tf.nn.l2_loss(tf.sub(infer, rate_batch))\n",
    "        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name=\"l2\")\n",
    "        cost = tf.add(cost_l2, tf.mul(regularizer, penalty))\n",
    "        # 'Follow the Regularized Leader' optimizer\n",
    "        # Reference: http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf\n",
    "        train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)\n",
    "    return cost, train_op"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of train samples 900188, test samples 100021, samples per batch 900\n"
     ]
    }
   ],
   "source": [
    "# Read data from ratings file to build a TF model\n",
    "df_train, df_test = get_data()\n",
    "\n",
    "samples_per_batch = len(df_train) // batch_size\n",
    "print(\"Number of train samples %d, test samples %d, samples per batch %d\" % \n",
    "      (len(df_train), len(df_test), samples_per_batch))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    5411\n",
      "1    5439\n",
      "2     367\n",
      "3     424\n",
      "4    4941\n",
      "Name: user, dtype: int32\n",
      "0    1696\n",
      "1    5448\n",
      "2    2242\n",
      "3    5629\n",
      "4     423\n",
      "Name: user, dtype: int32\n"
     ]
    }
   ],
   "source": [
    "# Peeking at the top 5 user values\n",
    "print(df_train[\"user\"].head()) \n",
    "print(df_test[\"user\"].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    2682\n",
      "1     903\n",
      "2    3716\n",
      "3    1720\n",
      "4    3696\n",
      "Name: item, dtype: int32\n",
      "0    3113\n",
      "1    1195\n",
      "2     749\n",
      "3    3623\n",
      "4    2899\n",
      "Name: item, dtype: int32\n"
     ]
    }
   ],
   "source": [
    "# Peeking at the top 5 item values\n",
    "print(df_train[\"item\"].head())\n",
    "print(df_test[\"item\"].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    2.0\n",
      "1    5.0\n",
      "2    4.0\n",
      "3    4.0\n",
      "4    1.0\n",
      "Name: rate, dtype: float32\n",
      "0    5.0\n",
      "1    5.0\n",
      "2    5.0\n",
      "3    2.0\n",
      "4    2.0\n",
      "Name: rate, dtype: float32\n"
     ]
    }
   ],
   "source": [
    "# Peeking at the top 5 rate values\n",
    "print(df_train[\"rate\"].head())\n",
    "print(df_test[\"rate\"].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Using a shuffle iterator to generate random batches, for training\n",
    "iter_train = readers.ShuffleIterator([df_train[\"user\"],\n",
    "                                     df_train[\"item\"],\n",
    "                                     df_train[\"rate\"]],\n",
    "                                     batch_size=batch_size)\n",
    "\n",
    "# Sequentially generate one-epoch batches, for testing\n",
    "iter_test = readers.OneEpochIterator([df_test[\"user\"],\n",
    "                                     df_test[\"item\"],\n",
    "                                     df_test[\"rate\"]],\n",
    "                                     batch_size=-1)\n",
    "\n",
    "user_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_user\")\n",
    "item_batch = tf.placeholder(tf.int32, shape=[None], name=\"id_item\")\n",
    "rate_batch = tf.placeholder(tf.float32, shape=[None])\n",
    "\n",
    "infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)\n",
    "_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.10, reg=0.05, device=place_device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch\tTrain Error\tVal Error\tElapsed Time\n",
      "00\t2.805\t\t2.793\t\t0.030 secs\n",
      "01\t1.304\t\t0.981\t\t1.585 secs\n",
      "02\t0.941\t\t0.933\t\t1.582 secs\n",
      "03\t0.907\t\t0.913\t\t1.652 secs\n",
      "04\t0.891\t\t0.903\t\t1.560 secs\n",
      "05\t0.882\t\t0.897\t\t1.756 secs\n",
      "06\t0.876\t\t0.893\t\t1.606 secs\n",
      "07\t0.872\t\t0.890\t\t1.553 secs\n",
      "08\t0.869\t\t0.888\t\t1.553 secs\n",
      "09\t0.866\t\t0.886\t\t1.686 secs\n",
      "10\t0.864\t\t0.884\t\t1.694 secs\n",
      "11\t0.861\t\t0.883\t\t1.610 secs\n",
      "12\t0.859\t\t0.882\t\t1.724 secs\n",
      "13\t0.857\t\t0.881\t\t1.563 secs\n",
      "14\t0.857\t\t0.880\t\t1.685 secs\n",
      "15\t0.855\t\t0.879\t\t1.683 secs\n",
      "16\t0.853\t\t0.878\t\t1.612 secs\n",
      "17\t0.851\t\t0.876\t\t1.676 secs\n",
      "18\t0.851\t\t0.876\t\t1.699 secs\n",
      "19\t0.849\t\t0.875\t\t1.623 secs\n",
      "20\t0.846\t\t0.874\t\t1.683 secs\n",
      "21\t0.845\t\t0.873\t\t1.726 secs\n",
      "22\t0.843\t\t0.872\t\t1.600 secs\n",
      "23\t0.843\t\t0.871\t\t1.559 secs\n",
      "24\t0.842\t\t0.871\t\t1.610 secs\n",
      "25\t0.841\t\t0.870\t\t1.579 secs\n",
      "26\t0.839\t\t0.869\t\t1.596 secs\n",
      "27\t0.840\t\t0.868\t\t1.578 secs\n",
      "28\t0.838\t\t0.868\t\t1.763 secs\n",
      "29\t0.837\t\t0.868\t\t1.557 secs\n",
      "30\t0.836\t\t0.868\t\t1.660 secs\n",
      "31\t0.836\t\t0.867\t\t1.805 secs\n",
      "32\t0.834\t\t0.867\t\t1.815 secs\n",
      "33\t0.833\t\t0.866\t\t1.687 secs\n",
      "34\t0.832\t\t0.866\t\t1.662 secs\n",
      "35\t0.832\t\t0.866\t\t1.724 secs\n",
      "36\t0.832\t\t0.865\t\t1.563 secs\n",
      "37\t0.832\t\t0.865\t\t1.565 secs\n",
      "38\t0.831\t\t0.865\t\t1.557 secs\n",
      "39\t0.830\t\t0.864\t\t1.562 secs\n",
      "40\t0.828\t\t0.864\t\t1.730 secs\n",
      "41\t0.829\t\t0.864\t\t2.110 secs\n",
      "42\t0.829\t\t0.864\t\t1.669 secs\n",
      "43\t0.828\t\t0.863\t\t1.820 secs\n",
      "44\t0.827\t\t0.863\t\t2.120 secs\n",
      "45\t0.827\t\t0.863\t\t1.981 secs\n",
      "46\t0.826\t\t0.862\t\t1.638 secs\n",
      "47\t0.826\t\t0.863\t\t1.675 secs\n",
      "48\t0.827\t\t0.863\t\t1.702 secs\n",
      "49\t0.825\t\t0.862\t\t1.668 secs\n"
     ]
    }
   ],
   "source": [
    "saver = tf.train.Saver()\n",
    "init_op = tf.global_variables_initializer()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    sess.run(init_op)\n",
    "    print(\"%s\\t%s\\t%s\\t%s\" % (\"Epoch\", \"Train Error\", \"Val Error\", \"Elapsed Time\"))\n",
    "    errors = deque(maxlen=samples_per_batch)\n",
    "    start = time.time()\n",
    "    for i in range(max_epochs * samples_per_batch):\n",
    "        users, items, rates = next(iter_train)\n",
    "        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,\n",
    "                                                               item_batch: items,\n",
    "                                                               rate_batch: rates})\n",
    "        pred_batch = clip(pred_batch)\n",
    "        errors.append(np.power(pred_batch - rates, 2))\n",
    "        if i % samples_per_batch == 0:\n",
    "            train_err = np.sqrt(np.mean(errors))\n",
    "            test_err2 = np.array([])\n",
    "            for users, items, rates in iter_test:\n",
    "                pred_batch = sess.run(infer, feed_dict={user_batch: users,\n",
    "                                                        item_batch: items})\n",
    "                pred_batch = clip(pred_batch)\n",
    "                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))\n",
    "            end = time.time()\n",
    "            \n",
    "            print(\"%02d\\t%.3f\\t\\t%.3f\\t\\t%.3f secs\" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))\n",
    "            start = end\n",
    "\n",
    "    saver.save(sess, './save/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![alt text](TrainValError.png \"Train / Validation Error vs. Epoch\")"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
